In [315]:
import pandas as pd
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt
import plotly.express as px # for making it interactive
from sklearn.linear_model import LinearRegression # for prediction of the next month revenue
import seaborn as sns
In [316]:
data0=pd.read_csv('ORIGNAL SUPERMARKET_DATA.csv')
In [317]:
data0.head()
Out[317]:
| Invoice ID | Branch | City | Customer type | Gender | Product line | Unit price | Quantity | Tax 5% | Total | Date | Time | Payment | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 750-67-8428 | A | Yangon | Member | Female | Health and beauty | 74.69 | 7 | 26.1415 | 548.9715 | 1/5/2019 | 13:08 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 |
| 1 | 226-31-3081 | C | Naypyitaw | Normal | Female | Electronic accessories | 15.28 | 5 | 3.8200 | 80.2200 | 3/8/2019 | 10:29 | Cash | 76.40 | 4.761905 | 3.8200 | 9.6 |
| 2 | 631-41-3108 | A | Yangon | Normal | Male | Home and lifestyle | 46.33 | 7 | 16.2155 | 340.5255 | 3/3/2019 | 13:23 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 |
| 3 | 123-19-1176 | A | Yangon | Member | Male | Health and beauty | 58.22 | 8 | 23.2880 | 489.0480 | 1/27/2019 | 20:33 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 |
| 4 | 373-73-7910 | A | Yangon | Normal | Male | Sports and travel | 86.31 | 7 | 30.2085 | 634.3785 | 2/8/2019 | 10:37 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 |
In [318]:
data0.columns
Out[318]:
Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender',
'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date',
'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income',
'Rating'],
dtype='object')
In [319]:
### Added new columns to the dataset.csv to keep the orignal data unchanged
In [320]:
# data['Date(dd-mm-yyyy)']=pd.to_datetime(data['Date(dd-mm-yyyy)'] , format='%d-%m-%Y' )
# data['Day']=data['Date(dd-mm-yyyy)'].dt.day_name()
In [321]:
data= pd.read_csv('dataset.csv')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0.5 1000 non-null int64 1 Unnamed: 0.4 1000 non-null int64 2 Unnamed: 0.3 1000 non-null int64 3 Unnamed: 0.2 1000 non-null int64 4 Unnamed: 0.1 1000 non-null int64 5 Unnamed: 0 1000 non-null int64 6 Invoice ID 1000 non-null object 7 Branch 1000 non-null object 8 City 1000 non-null object 9 Customer type 1000 non-null object 10 Gender 1000 non-null object 11 Product line 1000 non-null object 12 Unit price 1000 non-null float64 13 Quantity 1000 non-null int64 14 Tax 5% 1000 non-null float64 15 Total 1000 non-null float64 16 Time 1000 non-null object 17 Payment 1000 non-null object 18 cogs 1000 non-null float64 19 gross margin percentage 1000 non-null float64 20 gross income 1000 non-null float64 21 Rating 1000 non-null float64 22 Day 1000 non-null object 23 Date(dd-mm-yyyy) 1000 non-null object 24 Month 1000 non-null object dtypes: float64(7), int64(7), object(11) memory usage: 195.4+ KB
In [ ]:
In [322]:
data.describe()
Out[322]:
| Unnamed: 0.5 | Unnamed: 0.4 | Unnamed: 0.3 | Unnamed: 0.2 | Unnamed: 0.1 | Unnamed: 0 | Unit price | Quantity | Tax 5% | Total | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.00000 | 1000.000000 | 1000.000000 | 1000.00000 |
| mean | 499.500000 | 499.500000 | 499.500000 | 499.500000 | 499.500000 | 499.500000 | 55.672130 | 5.510000 | 15.379369 | 322.966749 | 307.58738 | 4.761905 | 15.379369 | 6.97270 |
| std | 288.819436 | 288.819436 | 288.819436 | 288.819436 | 288.819436 | 288.819436 | 26.494628 | 2.923431 | 11.708825 | 245.885335 | 234.17651 | 0.000000 | 11.708825 | 1.71858 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 10.080000 | 1.000000 | 0.508500 | 10.678500 | 10.17000 | 4.761905 | 0.508500 | 4.00000 |
| 25% | 249.750000 | 249.750000 | 249.750000 | 249.750000 | 249.750000 | 249.750000 | 32.875000 | 3.000000 | 5.924875 | 124.422375 | 118.49750 | 4.761905 | 5.924875 | 5.50000 |
| 50% | 499.500000 | 499.500000 | 499.500000 | 499.500000 | 499.500000 | 499.500000 | 55.230000 | 5.000000 | 12.088000 | 253.848000 | 241.76000 | 4.761905 | 12.088000 | 7.00000 |
| 75% | 749.250000 | 749.250000 | 749.250000 | 749.250000 | 749.250000 | 749.250000 | 77.935000 | 8.000000 | 22.445250 | 471.350250 | 448.90500 | 4.761905 | 22.445250 | 8.50000 |
| max | 999.000000 | 999.000000 | 999.000000 | 999.000000 | 999.000000 | 999.000000 | 99.960000 | 10.000000 | 49.650000 | 1042.650000 | 993.00000 | 4.761905 | 49.650000 | 10.00000 |
In [323]:
data.dtypes
Out[323]:
Unnamed: 0.5 int64 Unnamed: 0.4 int64 Unnamed: 0.3 int64 Unnamed: 0.2 int64 Unnamed: 0.1 int64 Unnamed: 0 int64 Invoice ID object Branch object City object Customer type object Gender object Product line object Unit price float64 Quantity int64 Tax 5% float64 Total float64 Time object Payment object cogs float64 gross margin percentage float64 gross income float64 Rating float64 Day object Date(dd-mm-yyyy) object Month object dtype: object
In [324]:
data['Quantity'].max()
Out[324]:
np.int64(10)
In [325]:
data['Quantity'].min()
Out[325]:
np.int64(1)
In [326]:
data.groupby('Quantity')['Customer type'].apply(list)
Out[326]:
Quantity 1 [Member, Normal, Normal, Normal, Normal, Norma... 2 [Member, Normal, Normal, Member, Normal, Norma... 3 [Normal, Member, Normal, Member, Member, Norma... 4 [Member, Member, Normal, Member, Member, Membe... 5 [Normal, Normal, Member, Member, Normal, Norma... 6 [Member, Normal, Normal, Normal, Member, Membe... 7 [Member, Normal, Normal, Member, Member, Membe... 8 [Member, Member, Normal, Normal, Normal, Norma... 9 [Normal, Normal, Normal, Member, Member, Norma... 10 [Normal, Normal, Member, Member, Member, Norma... Name: Customer type, dtype: object
HOW MANY MEMBERS in the dataset purchased the product where the Quantity is 6??¶
In [327]:
data.groupby('Quantity')['Customer type'].count()
Out[327]:
Quantity 1 112 2 91 3 90 4 109 5 102 6 98 7 102 8 85 9 92 10 119 Name: Customer type, dtype: int64
In [328]:
data.groupby('Quantity')['Customer type'].size()
Out[328]:
Quantity 1 112 2 91 3 90 4 109 5 102 6 98 7 102 8 85 9 92 10 119 Name: Customer type, dtype: int64
In [329]:
# need to declare a avatiable which will store the grouped values
group= data.groupby(['Quantity' , 'Customer type']).size()
group.loc[ 6 , 'Member']
Out[329]:
np.int64(42)
In [330]:
data.groupby(['Quantity' , 'Customer type']).size()
Out[330]:
Quantity Customer type
1 Member 57
Normal 55
2 Member 43
Normal 48
3 Member 44
Normal 46
4 Member 61
Normal 48
5 Member 48
Normal 54
6 Member 42
Normal 56
7 Member 51
Normal 51
8 Member 41
Normal 44
9 Member 51
Normal 41
10 Member 63
Normal 56
dtype: int64
now checking types of product and their count¶
In [331]:
data.groupby(['Product line','Quantity']).size()
Out[331]:
Product line Quantity
Electronic accessories 1 20
2 8
3 16
4 19
5 17
6 19
7 16
8 17
9 16
10 22
Fashion accessories 1 30
2 22
3 14
4 21
5 15
6 8
7 21
8 12
9 14
10 21
Food and beverages 1 15
2 16
3 23
4 18
5 21
6 17
7 12
8 15
9 17
10 20
Health and beauty 1 15
2 13
3 13
4 15
5 18
6 14
7 19
8 15
9 13
10 17
Home and lifestyle 1 13
2 14
3 13
4 22
5 13
6 21
7 12
8 17
9 18
10 17
Sports and travel 1 19
2 18
3 11
4 14
5 18
6 19
7 22
8 9
9 14
10 22
dtype: int64
no of people who brought electronic items¶
In [332]:
#Storing this data
product=data.groupby(['Product line','Quantity']).size()
product.loc['Electronic accessories'].sum()
Out[332]:
np.int64(170)
SORTING DATA ACCORDING TO THE MONTH¶
In [333]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0.5 1000 non-null int64 1 Unnamed: 0.4 1000 non-null int64 2 Unnamed: 0.3 1000 non-null int64 3 Unnamed: 0.2 1000 non-null int64 4 Unnamed: 0.1 1000 non-null int64 5 Unnamed: 0 1000 non-null int64 6 Invoice ID 1000 non-null object 7 Branch 1000 non-null object 8 City 1000 non-null object 9 Customer type 1000 non-null object 10 Gender 1000 non-null object 11 Product line 1000 non-null object 12 Unit price 1000 non-null float64 13 Quantity 1000 non-null int64 14 Tax 5% 1000 non-null float64 15 Total 1000 non-null float64 16 Time 1000 non-null object 17 Payment 1000 non-null object 18 cogs 1000 non-null float64 19 gross margin percentage 1000 non-null float64 20 gross income 1000 non-null float64 21 Rating 1000 non-null float64 22 Day 1000 non-null object 23 Date(dd-mm-yyyy) 1000 non-null object 24 Month 1000 non-null object dtypes: float64(7), int64(7), object(11) memory usage: 195.4+ KB
In [334]:
data['Date(dd-mm-yyyy)'].head()
Out[334]:
0 5/1/2019 1 3/3/2019 2 27-01-2019 3 8/2/2019 4 25-02-2019 Name: Date(dd-mm-yyyy), dtype: object
In [335]:
# #making a new column for the date format which stores the correct format
# #https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
# #for using dt we need to conver t the column into date time
# data['Date']=pd.to_datetime(data['Date'])
# # created a new column as Date(dd-mm-yyyy)
# data['Date(dd-mm-yyyy)']=data['Date'].dt.strftime('%d-%m-%Y')
need to convert this date format which is in YYYY-MM-DD to DD-MM-YYY¶
In [336]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0.5 1000 non-null int64 1 Unnamed: 0.4 1000 non-null int64 2 Unnamed: 0.3 1000 non-null int64 3 Unnamed: 0.2 1000 non-null int64 4 Unnamed: 0.1 1000 non-null int64 5 Unnamed: 0 1000 non-null int64 6 Invoice ID 1000 non-null object 7 Branch 1000 non-null object 8 City 1000 non-null object 9 Customer type 1000 non-null object 10 Gender 1000 non-null object 11 Product line 1000 non-null object 12 Unit price 1000 non-null float64 13 Quantity 1000 non-null int64 14 Tax 5% 1000 non-null float64 15 Total 1000 non-null float64 16 Time 1000 non-null object 17 Payment 1000 non-null object 18 cogs 1000 non-null float64 19 gross margin percentage 1000 non-null float64 20 gross income 1000 non-null float64 21 Rating 1000 non-null float64 22 Day 1000 non-null object 23 Date(dd-mm-yyyy) 1000 non-null object 24 Month 1000 non-null object dtypes: float64(7), int64(7), object(11) memory usage: 195.4+ KB
In [337]:
#ok so new column is added
data.head(20)
Out[337]:
| Unnamed: 0.5 | Unnamed: 0.4 | Unnamed: 0.3 | Unnamed: 0.2 | Unnamed: 0.1 | Unnamed: 0 | Invoice ID | Branch | City | Customer type | ... | Total | Time | Payment | cogs | gross margin percentage | gross income | Rating | Day | Date(dd-mm-yyyy) | Month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 750-67-8428 | A | Yangon | Member | ... | 548.9715 | 13:08:00 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 | Saturday | 5/1/2019 | January |
| 1 | 1 | 1 | 1 | 1 | 1 | 1 | 631-41-3108 | A | Yangon | Normal | ... | 340.5255 | 13:23:00 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 | Sunday | 3/3/2019 | March |
| 2 | 2 | 2 | 2 | 2 | 2 | 2 | 123-19-1176 | A | Yangon | Member | ... | 489.0480 | 20:33:00 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 | Sunday | 27-01-2019 | January |
| 3 | 3 | 3 | 3 | 3 | 3 | 3 | 373-73-7910 | A | Yangon | Normal | ... | 634.3785 | 10:37:00 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 | Friday | 8/2/2019 | February |
| 4 | 4 | 4 | 4 | 4 | 4 | 4 | 355-53-5943 | A | Yangon | Member | ... | 433.6920 | 14:36:00 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 | Monday | 25-02-2019 | February |
| 5 | 5 | 5 | 5 | 5 | 5 | 5 | 665-32-9167 | A | Yangon | Member | ... | 76.1460 | 17:15:00 | Credit card | 72.52 | 4.761905 | 3.6260 | 7.2 | Thursday | 10/1/2019 | January |
| 6 | 6 | 6 | 6 | 6 | 6 | 6 | 365-64-0515 | A | Yangon | Normal | ... | 246.4875 | 10:25:00 | Ewallet | 234.75 | 4.761905 | 11.7375 | 7.1 | Tuesday | 12/2/2019 | February |
| 7 | 7 | 7 | 7 | 7 | 7 | 7 | 252-56-2699 | A | Yangon | Normal | ... | 453.4950 | 16:48:00 | Ewallet | 431.90 | 4.761905 | 21.5950 | 8.2 | Thursday | 7/2/2019 | February |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 829-34-3910 | A | Yangon | Normal | ... | 749.4900 | 19:21:00 | Cash | 713.80 | 4.761905 | 35.6900 | 5.7 | Friday | 29-03-2019 | March |
| 9 | 9 | 9 | 9 | 9 | 9 | 9 | 656-95-9349 | A | Yangon | Member | ... | 506.6355 | 11:03:00 | Credit card | 482.51 | 4.761905 | 24.1255 | 4.6 | Monday | 11/3/2019 | March |
| 10 | 10 | 10 | 10 | 10 | 10 | 10 | 765-26-6951 | A | Yangon | Normal | ... | 457.4430 | 10:39:00 | Credit card | 435.66 | 4.761905 | 21.7830 | 6.9 | Tuesday | 1/1/2019 | January |
| 11 | 11 | 11 | 11 | 11 | 11 | 11 | 329-62-1586 | A | Yangon | Normal | ... | 172.2105 | 18:00:00 | Credit card | 164.01 | 4.761905 | 8.2005 | 8.6 | Monday | 21-01-2019 | January |
| 12 | 12 | 12 | 12 | 12 | 12 | 12 | 636-48-8204 | A | Yangon | Normal | ... | 181.4400 | 11:15:00 | Ewallet | 172.80 | 4.761905 | 8.6400 | 9.9 | Sunday | 17-02-2019 | February |
| 13 | 13 | 13 | 13 | 13 | 13 | 13 | 549-59-1358 | A | Yangon | Member | ... | 279.1845 | 17:36:00 | Ewallet | 265.89 | 4.761905 | 13.2945 | 6.0 | Saturday | 2/3/2019 | March |
| 14 | 14 | 14 | 14 | 14 | 14 | 14 | 227-03-5010 | A | Yangon | Member | ... | 441.7560 | 19:20:00 | Credit card | 420.72 | 4.761905 | 21.0360 | 8.5 | Friday | 22-03-2019 | March |
| 15 | 15 | 15 | 15 | 15 | 15 | 15 | 189-17-4241 | A | Yangon | Normal | ... | 184.1070 | 12:17:00 | Credit card | 175.34 | 4.761905 | 8.7670 | 7.7 | Sunday | 10/3/2019 | March |
| 16 | 16 | 16 | 16 | 16 | 16 | 16 | 848-62-7243 | A | Yangon | Normal | ... | 235.2105 | 15:36:00 | Cash | 224.01 | 4.761905 | 11.2005 | 7.4 | Friday | 15-03-2019 | March |
| 17 | 17 | 17 | 17 | 17 | 17 | 17 | 595-11-5460 | A | Yangon | Normal | ... | 202.8180 | 10:12:00 | Credit card | 193.16 | 4.761905 | 9.6580 | 5.1 | Friday | 15-03-2019 | March |
| 18 | 18 | 18 | 18 | 18 | 18 | 18 | 129-29-8530 | A | Yangon | Member | ... | 328.7550 | 19:15:00 | Ewallet | 313.10 | 4.761905 | 15.6550 | 7.0 | Sunday | 10/3/2019 | March |
| 19 | 19 | 19 | 19 | 19 | 19 | 19 | 272-65-1806 | A | Yangon | Normal | ... | 575.3160 | 17:17:00 | Ewallet | 547.92 | 4.761905 | 27.3960 | 4.7 | Tuesday | 15-01-2019 | January |
20 rows × 25 columns
In [338]:
#ok so now we have thwcorrect formatted data so we don't need the old column so we are gonna dele that
data.head()
Out[338]:
| Unnamed: 0.5 | Unnamed: 0.4 | Unnamed: 0.3 | Unnamed: 0.2 | Unnamed: 0.1 | Unnamed: 0 | Invoice ID | Branch | City | Customer type | ... | Total | Time | Payment | cogs | gross margin percentage | gross income | Rating | Day | Date(dd-mm-yyyy) | Month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 750-67-8428 | A | Yangon | Member | ... | 548.9715 | 13:08:00 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 | Saturday | 5/1/2019 | January |
| 1 | 1 | 1 | 1 | 1 | 1 | 1 | 631-41-3108 | A | Yangon | Normal | ... | 340.5255 | 13:23:00 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 | Sunday | 3/3/2019 | March |
| 2 | 2 | 2 | 2 | 2 | 2 | 2 | 123-19-1176 | A | Yangon | Member | ... | 489.0480 | 20:33:00 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 | Sunday | 27-01-2019 | January |
| 3 | 3 | 3 | 3 | 3 | 3 | 3 | 373-73-7910 | A | Yangon | Normal | ... | 634.3785 | 10:37:00 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 | Friday | 8/2/2019 | February |
| 4 | 4 | 4 | 4 | 4 | 4 | 4 | 355-53-5943 | A | Yangon | Member | ... | 433.6920 | 14:36:00 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 | Monday | 25-02-2019 | February |
5 rows × 25 columns
In [339]:
# #now sorting according to month
# #need to create a column called month
# data['Date(dd-mm-yyyy)']=pd.to_datetime(data['Date(dd-mm-yyyy)'], format='%d-%m-%Y')
# data['Month']=data['Date(dd-mm-yyyy)'].dt.month_name()
In [340]:
# # Pandas displays datetime objects in the format YYYY-MM-DD by default.
# #to to display the date in dd-mm-yyyy format, just need to format it like this:
# # Useing strftime() if you want the format to look nice for display or export.
# # Don’t use it before extracting month/day, because then it becomes a string again
# data['Date(dd-mm-yyyy)']=data['Date(dd-mm-yyyy)'].dt.strftime('%d-%m-%Y')
In [341]:
data.head(5)
Out[341]:
| Unnamed: 0.5 | Unnamed: 0.4 | Unnamed: 0.3 | Unnamed: 0.2 | Unnamed: 0.1 | Unnamed: 0 | Invoice ID | Branch | City | Customer type | ... | Total | Time | Payment | cogs | gross margin percentage | gross income | Rating | Day | Date(dd-mm-yyyy) | Month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 750-67-8428 | A | Yangon | Member | ... | 548.9715 | 13:08:00 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 | Saturday | 5/1/2019 | January |
| 1 | 1 | 1 | 1 | 1 | 1 | 1 | 631-41-3108 | A | Yangon | Normal | ... | 340.5255 | 13:23:00 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 | Sunday | 3/3/2019 | March |
| 2 | 2 | 2 | 2 | 2 | 2 | 2 | 123-19-1176 | A | Yangon | Member | ... | 489.0480 | 20:33:00 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 | Sunday | 27-01-2019 | January |
| 3 | 3 | 3 | 3 | 3 | 3 | 3 | 373-73-7910 | A | Yangon | Normal | ... | 634.3785 | 10:37:00 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 | Friday | 8/2/2019 | February |
| 4 | 4 | 4 | 4 | 4 | 4 | 4 | 355-53-5943 | A | Yangon | Member | ... | 433.6920 | 14:36:00 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 | Monday | 25-02-2019 | February |
5 rows × 25 columns
In [342]:
#taking the count of sales according to month
#month order is in alphabetical order
data.groupby(['Month' , 'Product line']).size()
Out[342]:
Month Product line
February Electronic accessories 54
Fashion accessories 60
Food and beverages 62
Health and beauty 46
Home and lifestyle 38
Sports and travel 43
January Electronic accessories 54
Fashion accessories 64
Food and beverages 56
Health and beauty 49
Home and lifestyle 59
Sports and travel 70
March Electronic accessories 62
Fashion accessories 54
Food and beverages 56
Health and beauty 57
Home and lifestyle 63
Sports and travel 53
dtype: int64
In [343]:
data['Month'].count
Out[343]:
<bound method Series.count of 0 January
1 March
2 January
3 February
4 February
...
995 January
996 January
997 March
998 February
999 January
Name: Month, Length: 1000, dtype: object>
In [344]:
data['Month'].nunique
Out[344]:
<bound method IndexOpsMixin.nunique of 0 January
1 March
2 January
3 February
4 February
...
995 January
996 January
997 March
998 February
999 January
Name: Month, Length: 1000, dtype: object>
In [345]:
data['Month'].nunique()
Out[345]:
3
In [346]:
data.groupby([data['Month']=='January' , data['Product line'] == 'Sports and travel'])
Out[346]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000214C7ACE150>
saving the file¶
In [347]:
data.to_csv('dataset.csv' , index='false')
In [348]:
data.head()
Out[348]:
| Unnamed: 0.5 | Unnamed: 0.4 | Unnamed: 0.3 | Unnamed: 0.2 | Unnamed: 0.1 | Unnamed: 0 | Invoice ID | Branch | City | Customer type | ... | Total | Time | Payment | cogs | gross margin percentage | gross income | Rating | Day | Date(dd-mm-yyyy) | Month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 750-67-8428 | A | Yangon | Member | ... | 548.9715 | 13:08:00 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 | Saturday | 5/1/2019 | January |
| 1 | 1 | 1 | 1 | 1 | 1 | 1 | 631-41-3108 | A | Yangon | Normal | ... | 340.5255 | 13:23:00 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 | Sunday | 3/3/2019 | March |
| 2 | 2 | 2 | 2 | 2 | 2 | 2 | 123-19-1176 | A | Yangon | Member | ... | 489.0480 | 20:33:00 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 | Sunday | 27-01-2019 | January |
| 3 | 3 | 3 | 3 | 3 | 3 | 3 | 373-73-7910 | A | Yangon | Normal | ... | 634.3785 | 10:37:00 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 | Friday | 8/2/2019 | February |
| 4 | 4 | 4 | 4 | 4 | 4 | 4 | 355-53-5943 | A | Yangon | Member | ... | 433.6920 | 14:36:00 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 | Monday | 25-02-2019 | February |
5 rows × 25 columns
In [349]:
data['Product line'].unique()
Out[349]:
array(['Health and beauty', 'Home and lifestyle', 'Sports and travel',
'Electronic accessories', 'Food and beverages',
'Fashion accessories'], dtype=object)
In [350]:
data['Product line'].nunique()
Out[350]:
6
In [351]:
data.groupby(['Month' , 'Product line']).size()
Out[351]:
Month Product line
February Electronic accessories 54
Fashion accessories 60
Food and beverages 62
Health and beauty 46
Home and lifestyle 38
Sports and travel 43
January Electronic accessories 54
Fashion accessories 64
Food and beverages 56
Health and beauty 49
Home and lifestyle 59
Sports and travel 70
March Electronic accessories 62
Fashion accessories 54
Food and beverages 56
Health and beauty 57
Home and lifestyle 63
Sports and travel 53
dtype: int64
In [352]:
data.groupby(['Month' , 'Product line' , 'Quantity']).size()
Out[352]:
Month Product line Quantity
February Electronic accessories 1 10
2 2
3 3
4 2
5 5
..
March Sports and travel 6 6
7 7
8 4
9 5
10 8
Length: 180, dtype: int64
In [353]:
data.head(10)
Out[353]:
| Unnamed: 0.5 | Unnamed: 0.4 | Unnamed: 0.3 | Unnamed: 0.2 | Unnamed: 0.1 | Unnamed: 0 | Invoice ID | Branch | City | Customer type | ... | Total | Time | Payment | cogs | gross margin percentage | gross income | Rating | Day | Date(dd-mm-yyyy) | Month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 750-67-8428 | A | Yangon | Member | ... | 548.9715 | 13:08:00 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 | Saturday | 5/1/2019 | January |
| 1 | 1 | 1 | 1 | 1 | 1 | 1 | 631-41-3108 | A | Yangon | Normal | ... | 340.5255 | 13:23:00 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 | Sunday | 3/3/2019 | March |
| 2 | 2 | 2 | 2 | 2 | 2 | 2 | 123-19-1176 | A | Yangon | Member | ... | 489.0480 | 20:33:00 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 | Sunday | 27-01-2019 | January |
| 3 | 3 | 3 | 3 | 3 | 3 | 3 | 373-73-7910 | A | Yangon | Normal | ... | 634.3785 | 10:37:00 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 | Friday | 8/2/2019 | February |
| 4 | 4 | 4 | 4 | 4 | 4 | 4 | 355-53-5943 | A | Yangon | Member | ... | 433.6920 | 14:36:00 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 | Monday | 25-02-2019 | February |
| 5 | 5 | 5 | 5 | 5 | 5 | 5 | 665-32-9167 | A | Yangon | Member | ... | 76.1460 | 17:15:00 | Credit card | 72.52 | 4.761905 | 3.6260 | 7.2 | Thursday | 10/1/2019 | January |
| 6 | 6 | 6 | 6 | 6 | 6 | 6 | 365-64-0515 | A | Yangon | Normal | ... | 246.4875 | 10:25:00 | Ewallet | 234.75 | 4.761905 | 11.7375 | 7.1 | Tuesday | 12/2/2019 | February |
| 7 | 7 | 7 | 7 | 7 | 7 | 7 | 252-56-2699 | A | Yangon | Normal | ... | 453.4950 | 16:48:00 | Ewallet | 431.90 | 4.761905 | 21.5950 | 8.2 | Thursday | 7/2/2019 | February |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 829-34-3910 | A | Yangon | Normal | ... | 749.4900 | 19:21:00 | Cash | 713.80 | 4.761905 | 35.6900 | 5.7 | Friday | 29-03-2019 | March |
| 9 | 9 | 9 | 9 | 9 | 9 | 9 | 656-95-9349 | A | Yangon | Member | ... | 506.6355 | 11:03:00 | Credit card | 482.51 | 4.761905 | 24.1255 | 4.6 | Monday | 11/3/2019 | March |
10 rows × 25 columns
In [354]:
x=data.groupby(['Month' , 'Product line']).size()
x
Out[354]:
Month Product line
February Electronic accessories 54
Fashion accessories 60
Food and beverages 62
Health and beauty 46
Home and lifestyle 38
Sports and travel 43
January Electronic accessories 54
Fashion accessories 64
Food and beverages 56
Health and beauty 49
Home and lifestyle 59
Sports and travel 70
March Electronic accessories 62
Fashion accessories 54
Food and beverages 56
Health and beauty 57
Home and lifestyle 63
Sports and travel 53
dtype: int64
In [355]:
#size() = Compute the number of values in each group
y=data.groupby(['Product line' , 'Quantity']).size()
y
Out[355]:
Product line Quantity
Electronic accessories 1 20
2 8
3 16
4 19
5 17
6 19
7 16
8 17
9 16
10 22
Fashion accessories 1 30
2 22
3 14
4 21
5 15
6 8
7 21
8 12
9 14
10 21
Food and beverages 1 15
2 16
3 23
4 18
5 21
6 17
7 12
8 15
9 17
10 20
Health and beauty 1 15
2 13
3 13
4 15
5 18
6 14
7 19
8 15
9 13
10 17
Home and lifestyle 1 13
2 14
3 13
4 22
5 13
6 21
7 12
8 17
9 18
10 17
Sports and travel 1 19
2 18
3 11
4 14
5 18
6 19
7 22
8 9
9 14
10 22
dtype: int64
In [356]:
data['Quantity'].sum()
Out[356]:
np.int64(5510)
In [357]:
pd.concat([x ,y],axis=0).head(10)
Out[357]:
February Electronic accessories 54
Fashion accessories 60
Food and beverages 62
Health and beauty 46
Home and lifestyle 38
Sports and travel 43
January Electronic accessories 54
Fashion accessories 64
Food and beverages 56
Health and beauty 49
dtype: int64
In [358]:
z=data.groupby(['Month' , 'Quantity']).size()
z
Out[358]:
Month Quantity
February 1 42
2 24
3 29
4 30
5 30
6 30
7 27
8 23
9 26
10 42
January 1 41
2 32
3 26
4 36
5 34
6 40
7 36
8 30
9 34
10 43
March 1 29
2 35
3 35
4 43
5 38
6 28
7 39
8 32
9 32
10 34
dtype: int64
In [359]:
z.plot.bar()
Out[359]:
<Axes: xlabel='Month,Quantity'>
In [360]:
### Need to make this graph more visually appealing
In [361]:
data.groupby(['Month' ,'Date(dd-mm-yyyy)', 'Product line' , 'Quantity' ]).size().reset_index(name='Count').head(50)
Out[361]:
| Month | Date(dd-mm-yyyy) | Product line | Quantity | Count | |
|---|---|---|---|---|---|
| 0 | February | 1/2/2019 | Electronic accessories | 9 | 1 |
| 1 | February | 1/2/2019 | Food and beverages | 5 | 1 |
| 2 | February | 1/2/2019 | Food and beverages | 6 | 1 |
| 3 | February | 1/2/2019 | Health and beauty | 4 | 1 |
| 4 | February | 1/2/2019 | Home and lifestyle | 9 | 1 |
| 5 | February | 1/2/2019 | Sports and travel | 7 | 1 |
| 6 | February | 10/2/2019 | Electronic accessories | 3 | 1 |
| 7 | February | 10/2/2019 | Electronic accessories | 6 | 1 |
| 8 | February | 10/2/2019 | Electronic accessories | 10 | 1 |
| 9 | February | 10/2/2019 | Fashion accessories | 7 | 2 |
| 10 | February | 10/2/2019 | Fashion accessories | 10 | 1 |
| 11 | February | 10/2/2019 | Food and beverages | 5 | 1 |
| 12 | February | 10/2/2019 | Health and beauty | 2 | 1 |
| 13 | February | 10/2/2019 | Home and lifestyle | 1 | 1 |
| 14 | February | 10/2/2019 | Home and lifestyle | 7 | 1 |
| 15 | February | 10/2/2019 | Sports and travel | 4 | 1 |
| 16 | February | 11/2/2019 | Fashion accessories | 10 | 1 |
| 17 | February | 11/2/2019 | Food and beverages | 10 | 2 |
| 18 | February | 11/2/2019 | Health and beauty | 9 | 1 |
| 19 | February | 11/2/2019 | Home and lifestyle | 3 | 2 |
| 20 | February | 11/2/2019 | Home and lifestyle | 5 | 1 |
| 21 | February | 11/2/2019 | Sports and travel | 4 | 1 |
| 22 | February | 12/2/2019 | Electronic accessories | 1 | 1 |
| 23 | February | 12/2/2019 | Electronic accessories | 5 | 1 |
| 24 | February | 12/2/2019 | Fashion accessories | 1 | 1 |
| 25 | February | 12/2/2019 | Fashion accessories | 8 | 1 |
| 26 | February | 12/2/2019 | Food and beverages | 10 | 1 |
| 27 | February | 12/2/2019 | Health and beauty | 6 | 1 |
| 28 | February | 12/2/2019 | Home and lifestyle | 10 | 1 |
| 29 | February | 12/2/2019 | Sports and travel | 2 | 1 |
| 30 | February | 13-02-2019 | Electronic accessories | 1 | 1 |
| 31 | February | 13-02-2019 | Electronic accessories | 4 | 1 |
| 32 | February | 13-02-2019 | Fashion accessories | 2 | 1 |
| 33 | February | 13-02-2019 | Fashion accessories | 5 | 1 |
| 34 | February | 13-02-2019 | Fashion accessories | 7 | 1 |
| 35 | February | 13-02-2019 | Food and beverages | 2 | 1 |
| 36 | February | 13-02-2019 | Sports and travel | 4 | 1 |
| 37 | February | 13-02-2019 | Sports and travel | 6 | 1 |
| 38 | February | 14-02-2019 | Electronic accessories | 8 | 1 |
| 39 | February | 14-02-2019 | Food and beverages | 2 | 1 |
| 40 | February | 14-02-2019 | Food and beverages | 10 | 1 |
| 41 | February | 14-02-2019 | Health and beauty | 1 | 1 |
| 42 | February | 14-02-2019 | Health and beauty | 2 | 1 |
| 43 | February | 14-02-2019 | Health and beauty | 4 | 1 |
| 44 | February | 14-02-2019 | Sports and travel | 3 | 1 |
| 45 | February | 14-02-2019 | Sports and travel | 7 | 1 |
| 46 | February | 15-02-2019 | Electronic accessories | 1 | 1 |
| 47 | February | 15-02-2019 | Electronic accessories | 9 | 1 |
| 48 | February | 15-02-2019 | Electronic accessories | 10 | 1 |
| 49 | February | 15-02-2019 | Fashion accessories | 3 | 2 |
In [362]:
GROUP_1_DATA=data.groupby(['Month' ,'Date(dd-mm-yyyy)','Day', 'Product line' , 'Quantity' ]).size().reset_index(name='Count')
GROUP_1_DATA.to_csv('GROUP_1_DATA .csv' , index= False )
In [363]:
data2=pd.read_csv('GROUP_1_DATA .csv')
data2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 903 entries, 0 to 902 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month 903 non-null object 1 Date(dd-mm-yyyy) 903 non-null object 2 Day 903 non-null object 3 Product line 903 non-null object 4 Quantity 903 non-null int64 5 Count 903 non-null int64 dtypes: int64(2), object(4) memory usage: 42.5+ KB
In [364]:
data2['Month'].unique()
Out[364]:
array(['February', 'January', 'March'], dtype=object)
In [365]:
data2['Product line'].unique()
Out[365]:
array(['Electronic accessories', 'Food and beverages',
'Health and beauty', 'Home and lifestyle', 'Sports and travel',
'Fashion accessories'], dtype=object)
In [366]:
data2.groupby(['Month', 'Day' , 'Product line' ])['Quantity'].sum().reset_index(name='Count').head(20)
Out[366]:
| Month | Day | Product line | Count | |
|---|---|---|---|---|
| 0 | February | Friday | Electronic accessories | 39 |
| 1 | February | Friday | Fashion accessories | 56 |
| 2 | February | Friday | Food and beverages | 24 |
| 3 | February | Friday | Health and beauty | 36 |
| 4 | February | Friday | Home and lifestyle | 38 |
| 5 | February | Friday | Sports and travel | 55 |
| 6 | February | Monday | Electronic accessories | 31 |
| 7 | February | Monday | Fashion accessories | 35 |
| 8 | February | Monday | Food and beverages | 23 |
| 9 | February | Monday | Health and beauty | 39 |
| 10 | February | Monday | Home and lifestyle | 29 |
| 11 | February | Monday | Sports and travel | 15 |
| 12 | February | Saturday | Electronic accessories | 44 |
| 13 | February | Saturday | Fashion accessories | 54 |
| 14 | February | Saturday | Food and beverages | 56 |
| 15 | February | Saturday | Health and beauty | 21 |
| 16 | February | Saturday | Home and lifestyle | 20 |
| 17 | February | Saturday | Sports and travel | 35 |
| 18 | February | Sunday | Electronic accessories | 48 |
| 19 | February | Sunday | Fashion accessories | 28 |
In [367]:
#storing the grouped dataframe in the variable called graph
graph=data2.groupby(['Month' ,'Day','Product line'])['Quantity'].sum().reset_index(name='Count')
graph
Out[367]:
| Month | Day | Product line | Count | |
|---|---|---|---|---|
| 0 | February | Friday | Electronic accessories | 39 |
| 1 | February | Friday | Fashion accessories | 56 |
| 2 | February | Friday | Food and beverages | 24 |
| 3 | February | Friday | Health and beauty | 36 |
| 4 | February | Friday | Home and lifestyle | 38 |
| ... | ... | ... | ... | ... |
| 121 | March | Wednesday | Fashion accessories | 39 |
| 122 | March | Wednesday | Food and beverages | 37 |
| 123 | March | Wednesday | Health and beauty | 19 |
| 124 | March | Wednesday | Home and lifestyle | 39 |
| 125 | March | Wednesday | Sports and travel | 70 |
126 rows × 4 columns
In [368]:
graph.columns
Out[368]:
Index(['Month', 'Day', 'Product line', 'Count'], dtype='object')
In [369]:
graph['Month'].unique()
Out[369]:
array(['February', 'January', 'March'], dtype=object)
PLOTTING THE GRAPH¶
In [370]:
plt.figure(figsize=(20, 10)) #size of figure 20 bredth 10 height
Out[370]:
<Figure size 2000x1000 with 0 Axes>
<Figure size 2000x1000 with 0 Axes>
Quantity of product sold on week days of the month¶
In [371]:
g= sns.catplot(data=graph
,x='Day' ,
y='Count' ,
hue='Product line' ,
row='Month' ,
kind='bar' ,
height=5,
aspect=1.5,
palette='Set2',
sharex=False)
#sharex=false for dispplaying the days in all the graphs
# change col to row to make it vertical
plt.show()
MAKING IT INTERACTIVE USING PLOTLY EXPRESS¶
In [454]:
g= px.bar(
graph,
x="Day",
y="Count",
color="Product line",
facet_row="Month",
barmode="group",
height=1400,
color_discrete_sequence=px.colors.qualitative.Set2,
title="Sales by Product Line and Day (Interactive)"
)
g.update_xaxes(matches='x')
g.update_yaxes(matches='y')
g.show()
In [455]:
g.for_each_xaxis(lambda axis :
axis.update(title='Day'))
g.update_xaxes(showticklabels=True)
In [374]:
data2.columns
Out[374]:
Index(['Month', 'Date(dd-mm-yyyy)', 'Day', 'Product line', 'Quantity',
'Count'],
dtype='object')
In [375]:
data2['Product line'].unique()
Out[375]:
array(['Electronic accessories', 'Food and beverages',
'Health and beauty', 'Home and lifestyle', 'Sports and travel',
'Fashion accessories'], dtype=object)
In [376]:
data2['Product line'].nunique()
Out[376]:
6
In [377]:
data2['Product line'].count()
Out[377]:
np.int64(903)
In [378]:
data2.describe()
Out[378]:
| Quantity | Count | |
|---|---|---|
| count | 903.000000 | 903.000000 |
| mean | 5.513843 | 1.107420 |
| std | 2.930378 | 0.333928 |
| min | 1.000000 | 1.000000 |
| 25% | 3.000000 | 1.000000 |
| 50% | 5.000000 | 1.000000 |
| 75% | 8.000000 | 1.000000 |
| max | 10.000000 | 3.000000 |
In [379]:
data2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 903 entries, 0 to 902 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month 903 non-null object 1 Date(dd-mm-yyyy) 903 non-null object 2 Day 903 non-null object 3 Product line 903 non-null object 4 Quantity 903 non-null int64 5 Count 903 non-null int64 dtypes: int64(2), object(4) memory usage: 42.5+ KB
In [380]:
#data is clean and no missing columns are there in it 903== 903`
adding tax column to data 2¶
In [381]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0.5 1000 non-null int64 1 Unnamed: 0.4 1000 non-null int64 2 Unnamed: 0.3 1000 non-null int64 3 Unnamed: 0.2 1000 non-null int64 4 Unnamed: 0.1 1000 non-null int64 5 Unnamed: 0 1000 non-null int64 6 Invoice ID 1000 non-null object 7 Branch 1000 non-null object 8 City 1000 non-null object 9 Customer type 1000 non-null object 10 Gender 1000 non-null object 11 Product line 1000 non-null object 12 Unit price 1000 non-null float64 13 Quantity 1000 non-null int64 14 Tax 5% 1000 non-null float64 15 Total 1000 non-null float64 16 Time 1000 non-null object 17 Payment 1000 non-null object 18 cogs 1000 non-null float64 19 gross margin percentage 1000 non-null float64 20 gross income 1000 non-null float64 21 Rating 1000 non-null float64 22 Day 1000 non-null object 23 Date(dd-mm-yyyy) 1000 non-null object 24 Month 1000 non-null object dtypes: float64(7), int64(7), object(11) memory usage: 195.4+ KB
In [382]:
data.columns
Out[382]:
Index(['Unnamed: 0.5', 'Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2',
'Unnamed: 0.1', 'Unnamed: 0', 'Invoice ID', 'Branch', 'City',
'Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity',
'Tax 5%', 'Total', 'Time', 'Payment', 'cogs', 'gross margin percentage',
'gross income', 'Rating', 'Day', 'Date(dd-mm-yyyy)', 'Month'],
dtype='object')
In [383]:
data2['Tax']=data['Tax 5%'].reindex(data2.index) # in data 2 tax column wil contain the values from the column tax 5% iin data
In [384]:
data2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 903 entries, 0 to 902 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month 903 non-null object 1 Date(dd-mm-yyyy) 903 non-null object 2 Day 903 non-null object 3 Product line 903 non-null object 4 Quantity 903 non-null int64 5 Count 903 non-null int64 6 Tax 903 non-null float64 dtypes: float64(1), int64(2), object(4) memory usage: 49.5+ KB
In [385]:
data2.describe()
Out[385]:
| Quantity | Count | Tax | |
|---|---|---|---|
| count | 903.000000 | 903.000000 | 903.000000 |
| mean | 5.513843 | 1.107420 | 15.303181 |
| std | 2.930378 | 0.333928 | 11.657250 |
| min | 1.000000 | 1.000000 | 0.604500 |
| 25% | 3.000000 | 1.000000 | 5.966000 |
| 50% | 5.000000 | 1.000000 | 12.048000 |
| 75% | 8.000000 | 1.000000 | 22.258500 |
| max | 10.000000 | 3.000000 | 49.650000 |
In [386]:
data2.head(10)
Out[386]:
| Month | Date(dd-mm-yyyy) | Day | Product line | Quantity | Count | Tax | |
|---|---|---|---|---|---|---|---|
| 0 | February | 1/2/2019 | Friday | Electronic accessories | 9 | 1 | 26.1415 |
| 1 | February | 1/2/2019 | Friday | Food and beverages | 5 | 1 | 16.2155 |
| 2 | February | 1/2/2019 | Friday | Food and beverages | 6 | 1 | 23.2880 |
| 3 | February | 1/2/2019 | Friday | Health and beauty | 4 | 1 | 30.2085 |
| 4 | February | 1/2/2019 | Friday | Home and lifestyle | 9 | 1 | 20.6520 |
| 5 | February | 1/2/2019 | Friday | Sports and travel | 7 | 1 | 3.6260 |
| 6 | February | 10/2/2019 | Sunday | Electronic accessories | 3 | 1 | 11.7375 |
| 7 | February | 10/2/2019 | Sunday | Electronic accessories | 6 | 1 | 21.5950 |
| 8 | February | 10/2/2019 | Sunday | Electronic accessories | 10 | 1 | 35.6900 |
| 9 | February | 10/2/2019 | Sunday | Fashion accessories | 7 | 2 | 24.1255 |
In [387]:
data2.columns
Out[387]:
Index(['Month', 'Date(dd-mm-yyyy)', 'Day', 'Product line', 'Quantity', 'Count',
'Tax'],
dtype='object')
In [388]:
data2.columns.tolist()
Out[388]:
['Month', 'Date(dd-mm-yyyy)', 'Day', 'Product line', 'Quantity', 'Count', 'Tax']
In [389]:
data2.columns
Out[389]:
Index(['Month', 'Date(dd-mm-yyyy)', 'Day', 'Product line', 'Quantity', 'Count',
'Tax'],
dtype='object')
In [390]:
data2.drop('Tax' , axis=1, ) # axis 1 cuz its column and if it would have been rows the axis =0
Out[390]:
| Month | Date(dd-mm-yyyy) | Day | Product line | Quantity | Count | |
|---|---|---|---|---|---|---|
| 0 | February | 1/2/2019 | Friday | Electronic accessories | 9 | 1 |
| 1 | February | 1/2/2019 | Friday | Food and beverages | 5 | 1 |
| 2 | February | 1/2/2019 | Friday | Food and beverages | 6 | 1 |
| 3 | February | 1/2/2019 | Friday | Health and beauty | 4 | 1 |
| 4 | February | 1/2/2019 | Friday | Home and lifestyle | 9 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 898 | March | 9/3/2019 | Saturday | Home and lifestyle | 7 | 1 |
| 899 | March | 9/3/2019 | Saturday | Sports and travel | 2 | 1 |
| 900 | March | 9/3/2019 | Saturday | Sports and travel | 5 | 1 |
| 901 | March | 9/3/2019 | Saturday | Sports and travel | 9 | 1 |
| 902 | March | 9/3/2019 | Saturday | Sports and travel | 10 | 1 |
903 rows × 6 columns
In [391]:
data2.to_csv('Tax_dataset.csv' ,index=False)
In [392]:
# need to remove the tax column from this data set cuz the values are not correct
Need to group data so the values matches correctly¶
In [393]:
data2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 903 entries, 0 to 902 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month 903 non-null object 1 Date(dd-mm-yyyy) 903 non-null object 2 Day 903 non-null object 3 Product line 903 non-null object 4 Quantity 903 non-null int64 5 Count 903 non-null int64 6 Tax 903 non-null float64 dtypes: float64(1), int64(2), object(4) memory usage: 49.5+ KB
In [394]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0.5 1000 non-null int64 1 Unnamed: 0.4 1000 non-null int64 2 Unnamed: 0.3 1000 non-null int64 3 Unnamed: 0.2 1000 non-null int64 4 Unnamed: 0.1 1000 non-null int64 5 Unnamed: 0 1000 non-null int64 6 Invoice ID 1000 non-null object 7 Branch 1000 non-null object 8 City 1000 non-null object 9 Customer type 1000 non-null object 10 Gender 1000 non-null object 11 Product line 1000 non-null object 12 Unit price 1000 non-null float64 13 Quantity 1000 non-null int64 14 Tax 5% 1000 non-null float64 15 Total 1000 non-null float64 16 Time 1000 non-null object 17 Payment 1000 non-null object 18 cogs 1000 non-null float64 19 gross margin percentage 1000 non-null float64 20 gross income 1000 non-null float64 21 Rating 1000 non-null float64 22 Day 1000 non-null object 23 Date(dd-mm-yyyy) 1000 non-null object 24 Month 1000 non-null object dtypes: float64(7), int64(7), object(11) memory usage: 195.4+ KB
Grouping the coulumns city , Productline , unit price , Quantity , tax 5% on total , Total ,Month ,¶
In [395]:
TAX_GROUP=data.groupby(['Month','Product line', 'Quantity' ,'Unit price','Tax 5%','Total']).size().reset_index(name='Count')
In [396]:
TAX_GROUP
Out[396]:
| Month | Product line | Quantity | Unit price | Tax 5% | Total | Count | |
|---|---|---|---|---|---|---|---|
| 0 | February | Electronic accessories | 1 | 28.96 | 1.4480 | 30.4080 | 1 |
| 1 | February | Electronic accessories | 1 | 39.48 | 1.9740 | 41.4540 | 1 |
| 2 | February | Electronic accessories | 1 | 39.75 | 1.9875 | 41.7375 | 1 |
| 3 | February | Electronic accessories | 1 | 60.30 | 3.0150 | 63.3150 | 1 |
| 4 | February | Electronic accessories | 1 | 60.95 | 3.0475 | 63.9975 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | March | Sports and travel | 10 | 44.02 | 22.0100 | 462.2100 | 1 |
| 995 | March | Sports and travel | 10 | 52.26 | 26.1300 | 548.7300 | 1 |
| 996 | March | Sports and travel | 10 | 54.55 | 27.2750 | 572.7750 | 1 |
| 997 | March | Sports and travel | 10 | 69.74 | 34.8700 | 732.2700 | 1 |
| 998 | March | Sports and travel | 10 | 76.92 | 38.4600 | 807.6600 | 1 |
999 rows × 7 columns
In [397]:
TAX_GROUP.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 999 entries, 0 to 998 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month 999 non-null object 1 Product line 999 non-null object 2 Quantity 999 non-null int64 3 Unit price 999 non-null float64 4 Tax 5% 999 non-null float64 5 Total 999 non-null float64 6 Count 999 non-null int64 dtypes: float64(3), int64(2), object(2) memory usage: 54.8+ KB
In [398]:
TAX_GROUP.to_csv('Tax_dataset.csv', index=False)
In [399]:
data3=pd.read_csv('Tax_dataset.csv')
In [400]:
data3.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 999 entries, 0 to 998 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month 999 non-null object 1 Product line 999 non-null object 2 Quantity 999 non-null int64 3 Unit price 999 non-null float64 4 Tax 5% 999 non-null float64 5 Total 999 non-null float64 6 Count 999 non-null int64 dtypes: float64(3), int64(2), object(2) memory usage: 54.8+ KB
In [401]:
data3.head(50)
Out[401]:
| Month | Product line | Quantity | Unit price | Tax 5% | Total | Count | |
|---|---|---|---|---|---|---|---|
| 0 | February | Electronic accessories | 1 | 28.96 | 1.4480 | 30.4080 | 1 |
| 1 | February | Electronic accessories | 1 | 39.48 | 1.9740 | 41.4540 | 1 |
| 2 | February | Electronic accessories | 1 | 39.75 | 1.9875 | 41.7375 | 1 |
| 3 | February | Electronic accessories | 1 | 60.30 | 3.0150 | 63.3150 | 1 |
| 4 | February | Electronic accessories | 1 | 60.95 | 3.0475 | 63.9975 | 1 |
| 5 | February | Electronic accessories | 1 | 62.48 | 3.1240 | 65.6040 | 1 |
| 6 | February | Electronic accessories | 1 | 71.95 | 3.5975 | 75.5475 | 1 |
| 7 | February | Electronic accessories | 1 | 76.82 | 3.8410 | 80.6610 | 1 |
| 8 | February | Electronic accessories | 1 | 98.84 | 4.9420 | 103.7820 | 1 |
| 9 | February | Electronic accessories | 1 | 99.69 | 4.9845 | 104.6745 | 1 |
| 10 | February | Electronic accessories | 2 | 20.89 | 2.0890 | 43.8690 | 1 |
| 11 | February | Electronic accessories | 2 | 46.61 | 4.6610 | 97.8810 | 1 |
| 12 | February | Electronic accessories | 3 | 48.09 | 7.2135 | 151.4835 | 1 |
| 13 | February | Electronic accessories | 3 | 81.40 | 12.2100 | 256.4100 | 1 |
| 14 | February | Electronic accessories | 3 | 94.64 | 14.1960 | 298.1160 | 1 |
| 15 | February | Electronic accessories | 4 | 32.25 | 6.4500 | 135.4500 | 1 |
| 16 | February | Electronic accessories | 4 | 65.94 | 13.1880 | 276.9480 | 1 |
| 17 | February | Electronic accessories | 5 | 11.81 | 2.9525 | 62.0025 | 1 |
| 18 | February | Electronic accessories | 5 | 12.05 | 3.0125 | 63.2625 | 1 |
| 19 | February | Electronic accessories | 5 | 34.56 | 8.6400 | 181.4400 | 1 |
| 20 | February | Electronic accessories | 5 | 46.95 | 11.7375 | 246.4875 | 1 |
| 21 | February | Electronic accessories | 5 | 86.04 | 21.5100 | 451.7100 | 1 |
| 22 | February | Electronic accessories | 6 | 12.45 | 3.7350 | 78.4350 | 1 |
| 23 | February | Electronic accessories | 6 | 18.93 | 5.6790 | 119.2590 | 1 |
| 24 | February | Electronic accessories | 6 | 35.49 | 10.6470 | 223.5870 | 1 |
| 25 | February | Electronic accessories | 6 | 46.02 | 13.8060 | 289.9260 | 1 |
| 26 | February | Electronic accessories | 6 | 50.45 | 15.1350 | 317.8350 | 1 |
| 27 | February | Electronic accessories | 6 | 68.84 | 20.6520 | 433.6920 | 1 |
| 28 | February | Electronic accessories | 6 | 87.45 | 26.2350 | 550.9350 | 1 |
| 29 | February | Electronic accessories | 6 | 90.70 | 27.2100 | 571.4100 | 1 |
| 30 | February | Electronic accessories | 7 | 25.22 | 8.8270 | 185.3670 | 1 |
| 31 | February | Electronic accessories | 7 | 26.26 | 9.1910 | 193.0110 | 1 |
| 32 | February | Electronic accessories | 7 | 74.58 | 26.1030 | 548.1630 | 1 |
| 33 | February | Electronic accessories | 7 | 92.60 | 32.4100 | 680.6100 | 1 |
| 34 | February | Electronic accessories | 8 | 14.96 | 5.9840 | 125.6640 | 1 |
| 35 | February | Electronic accessories | 8 | 35.74 | 14.2960 | 300.2160 | 1 |
| 36 | February | Electronic accessories | 8 | 40.86 | 16.3440 | 343.2240 | 1 |
| 37 | February | Electronic accessories | 8 | 57.91 | 23.1640 | 486.4440 | 1 |
| 38 | February | Electronic accessories | 8 | 71.89 | 28.7560 | 603.8760 | 1 |
| 39 | February | Electronic accessories | 8 | 85.98 | 34.3920 | 722.2320 | 1 |
| 40 | February | Electronic accessories | 8 | 99.56 | 39.8240 | 836.3040 | 1 |
| 41 | February | Electronic accessories | 9 | 23.07 | 10.3815 | 218.0115 | 1 |
| 42 | February | Electronic accessories | 9 | 69.58 | 31.3110 | 657.5310 | 1 |
| 43 | February | Electronic accessories | 9 | 75.59 | 34.0155 | 714.3255 | 1 |
| 44 | February | Electronic accessories | 9 | 77.63 | 34.9335 | 733.6035 | 1 |
| 45 | February | Electronic accessories | 9 | 88.25 | 39.7125 | 833.9625 | 1 |
| 46 | February | Electronic accessories | 10 | 17.42 | 8.7100 | 182.9100 | 1 |
| 47 | February | Electronic accessories | 10 | 22.95 | 11.4750 | 240.9750 | 1 |
| 48 | February | Electronic accessories | 10 | 24.74 | 12.3700 | 259.7700 | 1 |
| 49 | February | Electronic accessories | 10 | 32.80 | 16.4000 | 344.4000 | 1 |
In [402]:
T1=data3.groupby(['Month' , 'Product line'])['Tax 5%'].sum().reset_index()
tax_graph= px.bar(T1 ,y='Tax 5%' , x='Product line' , color='Product line' ,facet_row='Month' , title='Tax graph' , height=1200)
tax_graph.update_xaxes(showticklabels=True)
tax_graph.show()
In [403]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0.5 1000 non-null int64 1 Unnamed: 0.4 1000 non-null int64 2 Unnamed: 0.3 1000 non-null int64 3 Unnamed: 0.2 1000 non-null int64 4 Unnamed: 0.1 1000 non-null int64 5 Unnamed: 0 1000 non-null int64 6 Invoice ID 1000 non-null object 7 Branch 1000 non-null object 8 City 1000 non-null object 9 Customer type 1000 non-null object 10 Gender 1000 non-null object 11 Product line 1000 non-null object 12 Unit price 1000 non-null float64 13 Quantity 1000 non-null int64 14 Tax 5% 1000 non-null float64 15 Total 1000 non-null float64 16 Time 1000 non-null object 17 Payment 1000 non-null object 18 cogs 1000 non-null float64 19 gross margin percentage 1000 non-null float64 20 gross income 1000 non-null float64 21 Rating 1000 non-null float64 22 Day 1000 non-null object 23 Date(dd-mm-yyyy) 1000 non-null object 24 Month 1000 non-null object dtypes: float64(7), int64(7), object(11) memory usage: 195.4+ KB
rating vs product line which product line has the maximum rating¶
In [404]:
rating_data=data.groupby(['City','Product line' ,'Rating']).size().reset_index()
rating_data
Out[404]:
| City | Product line | Rating | 0 | |
|---|---|---|---|---|
| 0 | Mandalay | Electronic accessories | 4.0 | 1 |
| 1 | Mandalay | Electronic accessories | 4.2 | 1 |
| 2 | Mandalay | Electronic accessories | 4.3 | 1 |
| 3 | Mandalay | Electronic accessories | 4.8 | 1 |
| 4 | Mandalay | Electronic accessories | 4.9 | 1 |
| ... | ... | ... | ... | ... |
| 669 | Yangon | Sports and travel | 9.3 | 2 |
| 670 | Yangon | Sports and travel | 9.6 | 3 |
| 671 | Yangon | Sports and travel | 9.7 | 3 |
| 672 | Yangon | Sports and travel | 9.8 | 1 |
| 673 | Yangon | Sports and travel | 9.9 | 1 |
674 rows × 4 columns
In [405]:
rating_graph = rating_data.pivot_table( columns='Product line' , index='City' ,values='Rating' , aggfunc='mean' ).reset_index()
In [406]:
rating_graph.columns.name=None
In [407]:
rating_graph
Out[407]:
| City | Electronic accessories | Fashion accessories | Food and beverages | Health and beauty | Home and lifestyle | Sports and travel | |
|---|---|---|---|---|---|---|---|
| 0 | Mandalay | 7.069231 | 6.629268 | 6.897222 | 7.035294 | 6.802857 | 6.682051 |
| 1 | Naypyitaw | 6.887500 | 7.217073 | 7.140909 | 7.102778 | 7.077419 | 7.038235 |
| 2 | Yangon | 6.967647 | 6.902941 | 7.195238 | 6.931429 | 7.157143 | 7.218919 |
In [408]:
#Giving errors need to melt this data so it is correct form to plot
DATA MELTING need to covert this table to correct format for plotting the graph¶
In [409]:
melt = pd.melt(rating_graph , id_vars='City' , var_name='Product line' , value_name='Average rating')
melt
Out[409]:
| City | Product line | Average rating | |
|---|---|---|---|
| 0 | Mandalay | Electronic accessories | 7.069231 |
| 1 | Naypyitaw | Electronic accessories | 6.887500 |
| 2 | Yangon | Electronic accessories | 6.967647 |
| 3 | Mandalay | Fashion accessories | 6.629268 |
| 4 | Naypyitaw | Fashion accessories | 7.217073 |
| 5 | Yangon | Fashion accessories | 6.902941 |
| 6 | Mandalay | Food and beverages | 6.897222 |
| 7 | Naypyitaw | Food and beverages | 7.140909 |
| 8 | Yangon | Food and beverages | 7.195238 |
| 9 | Mandalay | Health and beauty | 7.035294 |
| 10 | Naypyitaw | Health and beauty | 7.102778 |
| 11 | Yangon | Health and beauty | 6.931429 |
| 12 | Mandalay | Home and lifestyle | 6.802857 |
| 13 | Naypyitaw | Home and lifestyle | 7.077419 |
| 14 | Yangon | Home and lifestyle | 7.157143 |
| 15 | Mandalay | Sports and travel | 6.682051 |
| 16 | Naypyitaw | Sports and travel | 7.038235 |
| 17 | Yangon | Sports and travel | 7.218919 |
In [410]:
RG=px.bar(melt,x='City' , y='Average rating' , color='Product line' ,barmode='group', title='Average ratings of Product line' )
RG.show()
In [411]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0.5 1000 non-null int64 1 Unnamed: 0.4 1000 non-null int64 2 Unnamed: 0.3 1000 non-null int64 3 Unnamed: 0.2 1000 non-null int64 4 Unnamed: 0.1 1000 non-null int64 5 Unnamed: 0 1000 non-null int64 6 Invoice ID 1000 non-null object 7 Branch 1000 non-null object 8 City 1000 non-null object 9 Customer type 1000 non-null object 10 Gender 1000 non-null object 11 Product line 1000 non-null object 12 Unit price 1000 non-null float64 13 Quantity 1000 non-null int64 14 Tax 5% 1000 non-null float64 15 Total 1000 non-null float64 16 Time 1000 non-null object 17 Payment 1000 non-null object 18 cogs 1000 non-null float64 19 gross margin percentage 1000 non-null float64 20 gross income 1000 non-null float64 21 Rating 1000 non-null float64 22 Day 1000 non-null object 23 Date(dd-mm-yyyy) 1000 non-null object 24 Month 1000 non-null object dtypes: float64(7), int64(7), object(11) memory usage: 195.4+ KB
SALES BY CITY¶
In [412]:
data.groupby(['City' , 'Total']).size().reset_index()
Out[412]:
| City | Total | 0 | |
|---|---|---|---|
| 0 | Mandalay | 18.6375 | 1 |
| 1 | Mandalay | 20.1075 | 1 |
| 2 | Mandalay | 26.7225 | 1 |
| 3 | Mandalay | 28.4235 | 1 |
| 4 | Mandalay | 30.9960 | 1 |
| ... | ... | ... | ... |
| 992 | Yangon | 926.9505 | 1 |
| 993 | Yangon | 931.0350 | 1 |
| 994 | Yangon | 932.3370 | 1 |
| 995 | Yangon | 951.8250 | 1 |
| 996 | Yangon | 1039.2900 | 1 |
997 rows × 3 columns
In [413]:
sales=data.groupby('City' )['Total'].sum().reset_index()
sales
Out[413]:
| City | Total | |
|---|---|---|
| 0 | Mandalay | 106197.6720 |
| 1 | Naypyitaw | 110568.7065 |
| 2 | Yangon | 106200.3705 |
In [414]:
sales_graph= px.pie(sales , names='City' , values='Total' , title='TOTAL Sales by city' )
sales_graph.show()
CALCULATING CUSTOMER SATISFACTION INDEX¶
In [415]:
rating=data.groupby(['City' , 'Product line' ,'Rating']).size().reset_index(name='Count of ratings occured')
rating.head(50)
Out[415]:
| City | Product line | Rating | Count of ratings occured | |
|---|---|---|---|---|
| 0 | Mandalay | Electronic accessories | 4.0 | 1 |
| 1 | Mandalay | Electronic accessories | 4.2 | 1 |
| 2 | Mandalay | Electronic accessories | 4.3 | 1 |
| 3 | Mandalay | Electronic accessories | 4.8 | 1 |
| 4 | Mandalay | Electronic accessories | 4.9 | 1 |
| 5 | Mandalay | Electronic accessories | 5.1 | 1 |
| 6 | Mandalay | Electronic accessories | 5.3 | 1 |
| 7 | Mandalay | Electronic accessories | 5.5 | 1 |
| 8 | Mandalay | Electronic accessories | 5.6 | 1 |
| 9 | Mandalay | Electronic accessories | 5.7 | 1 |
| 10 | Mandalay | Electronic accessories | 5.8 | 1 |
| 11 | Mandalay | Electronic accessories | 6.0 | 3 |
| 12 | Mandalay | Electronic accessories | 6.1 | 3 |
| 13 | Mandalay | Electronic accessories | 6.2 | 2 |
| 14 | Mandalay | Electronic accessories | 6.3 | 1 |
| 15 | Mandalay | Electronic accessories | 6.5 | 3 |
| 16 | Mandalay | Electronic accessories | 6.6 | 1 |
| 17 | Mandalay | Electronic accessories | 6.7 | 2 |
| 18 | Mandalay | Electronic accessories | 6.8 | 1 |
| 19 | Mandalay | Electronic accessories | 7.0 | 1 |
| 20 | Mandalay | Electronic accessories | 7.1 | 1 |
| 21 | Mandalay | Electronic accessories | 7.3 | 1 |
| 22 | Mandalay | Electronic accessories | 7.6 | 3 |
| 23 | Mandalay | Electronic accessories | 7.7 | 1 |
| 24 | Mandalay | Electronic accessories | 7.8 | 2 |
| 25 | Mandalay | Electronic accessories | 7.9 | 1 |
| 26 | Mandalay | Electronic accessories | 8.0 | 2 |
| 27 | Mandalay | Electronic accessories | 8.1 | 2 |
| 28 | Mandalay | Electronic accessories | 8.2 | 1 |
| 29 | Mandalay | Electronic accessories | 8.6 | 2 |
| 30 | Mandalay | Electronic accessories | 8.7 | 1 |
| 31 | Mandalay | Electronic accessories | 8.8 | 1 |
| 32 | Mandalay | Electronic accessories | 8.9 | 2 |
| 33 | Mandalay | Electronic accessories | 9.0 | 2 |
| 34 | Mandalay | Electronic accessories | 9.4 | 1 |
| 35 | Mandalay | Electronic accessories | 9.5 | 1 |
| 36 | Mandalay | Electronic accessories | 9.8 | 1 |
| 37 | Mandalay | Electronic accessories | 9.9 | 1 |
| 38 | Mandalay | Electronic accessories | 10.0 | 1 |
| 39 | Mandalay | Fashion accessories | 4.1 | 2 |
| 40 | Mandalay | Fashion accessories | 4.2 | 1 |
| 41 | Mandalay | Fashion accessories | 4.3 | 1 |
| 42 | Mandalay | Fashion accessories | 4.4 | 1 |
| 43 | Mandalay | Fashion accessories | 4.5 | 1 |
| 44 | Mandalay | Fashion accessories | 4.6 | 2 |
| 45 | Mandalay | Fashion accessories | 4.7 | 1 |
| 46 | Mandalay | Fashion accessories | 4.8 | 2 |
| 47 | Mandalay | Fashion accessories | 4.9 | 1 |
| 48 | Mandalay | Fashion accessories | 5.0 | 2 |
| 49 | Mandalay | Fashion accessories | 5.1 | 1 |
In [416]:
max_rating=rating['Rating'].max() # maximum rating is 10
max_rating
Out[416]:
np.float64(10.0)
In [417]:
rating['Count of ratings occured'].max()
Out[417]:
np.int64(5)
calculation of csi for each product line in the city¶
In [418]:
Q=rating.query("`Product line` == 'Electronic accessories' and City == 'Mandalay'")
Q
Out[418]:
| City | Product line | Rating | Count of ratings occured | |
|---|---|---|---|---|
| 0 | Mandalay | Electronic accessories | 4.0 | 1 |
| 1 | Mandalay | Electronic accessories | 4.2 | 1 |
| 2 | Mandalay | Electronic accessories | 4.3 | 1 |
| 3 | Mandalay | Electronic accessories | 4.8 | 1 |
| 4 | Mandalay | Electronic accessories | 4.9 | 1 |
| 5 | Mandalay | Electronic accessories | 5.1 | 1 |
| 6 | Mandalay | Electronic accessories | 5.3 | 1 |
| 7 | Mandalay | Electronic accessories | 5.5 | 1 |
| 8 | Mandalay | Electronic accessories | 5.6 | 1 |
| 9 | Mandalay | Electronic accessories | 5.7 | 1 |
| 10 | Mandalay | Electronic accessories | 5.8 | 1 |
| 11 | Mandalay | Electronic accessories | 6.0 | 3 |
| 12 | Mandalay | Electronic accessories | 6.1 | 3 |
| 13 | Mandalay | Electronic accessories | 6.2 | 2 |
| 14 | Mandalay | Electronic accessories | 6.3 | 1 |
| 15 | Mandalay | Electronic accessories | 6.5 | 3 |
| 16 | Mandalay | Electronic accessories | 6.6 | 1 |
| 17 | Mandalay | Electronic accessories | 6.7 | 2 |
| 18 | Mandalay | Electronic accessories | 6.8 | 1 |
| 19 | Mandalay | Electronic accessories | 7.0 | 1 |
| 20 | Mandalay | Electronic accessories | 7.1 | 1 |
| 21 | Mandalay | Electronic accessories | 7.3 | 1 |
| 22 | Mandalay | Electronic accessories | 7.6 | 3 |
| 23 | Mandalay | Electronic accessories | 7.7 | 1 |
| 24 | Mandalay | Electronic accessories | 7.8 | 2 |
| 25 | Mandalay | Electronic accessories | 7.9 | 1 |
| 26 | Mandalay | Electronic accessories | 8.0 | 2 |
| 27 | Mandalay | Electronic accessories | 8.1 | 2 |
| 28 | Mandalay | Electronic accessories | 8.2 | 1 |
| 29 | Mandalay | Electronic accessories | 8.6 | 2 |
| 30 | Mandalay | Electronic accessories | 8.7 | 1 |
| 31 | Mandalay | Electronic accessories | 8.8 | 1 |
| 32 | Mandalay | Electronic accessories | 8.9 | 2 |
| 33 | Mandalay | Electronic accessories | 9.0 | 2 |
| 34 | Mandalay | Electronic accessories | 9.4 | 1 |
| 35 | Mandalay | Electronic accessories | 9.5 | 1 |
| 36 | Mandalay | Electronic accessories | 9.8 | 1 |
| 37 | Mandalay | Electronic accessories | 9.9 | 1 |
| 38 | Mandalay | Electronic accessories | 10.0 | 1 |
In [419]:
Q.groupby('City')['Rating'].mean().reset_index()
Out[419]:
| City | Rating | |
|---|---|---|
| 0 | Mandalay | 7.069231 |
formual of CSI sum of ratings/total number of ratings *100/Max rating¶
In [420]:
total_rating=Q['Count of ratings occured'].sum()
total_rating
Out[420]:
np.int64(55)
In [421]:
CSI_electronics=(Q['Rating']*Q['Count of ratings occured']).sum()/total_rating/max_rating*100
CSI_electronics
Out[421]:
np.float64(71.16363636363636)
In [422]:
cities=rating['City'].unique()
cities
Out[422]:
array(['Mandalay', 'Naypyitaw', 'Yangon'], dtype=object)
In [423]:
product_lines = rating["Product line"].unique()
cities=rating['City'].unique()
CSI_DATA= []
for city in cities:
for line in product_lines:
Q = rating.query("`Product line` == @line and `City` == @city")
total_rating = Q["Count of ratings occured"].sum()
weighted_sum = (Q["Rating"] * Q["Count of ratings occured"]).sum()
max_rating = 10
csi = (weighted_sum / total_rating) / max_rating * 100
print(f"{city}: {line}: {csi:.2f}%")
CSI_DATA.append({
"City" : city,
"Product line": line,
"CSI(%)":round(csi,2)})
CSI_DF = pd.DataFrame(CSI_DATA)
Mandalay: Electronic accessories: 71.16% Mandalay: Fashion accessories: 67.23% Mandalay: Food and beverages: 69.94% Mandalay: Health and beauty: 71.00% Mandalay: Home and lifestyle: 65.16% Mandalay: Sports and travel: 65.10% Naypyitaw: Electronic accessories: 67.47% Naypyitaw: Fashion accessories: 74.40% Naypyitaw: Food and beverages: 70.80% Naypyitaw: Health and beauty: 69.98% Naypyitaw: Home and lifestyle: 70.60% Naypyitaw: Sports and travel: 70.29% Yangon: Electronic accessories: 69.12% Yangon: Fashion accessories: 68.78% Yangon: Food and beverages: 72.53% Yangon: Health and beauty: 69.00% Yangon: Home and lifestyle: 69.31% Yangon: Sports and travel: 72.58%
In [424]:
CSI_DF
Out[424]:
| City | Product line | CSI(%) | |
|---|---|---|---|
| 0 | Mandalay | Electronic accessories | 71.16 |
| 1 | Mandalay | Fashion accessories | 67.23 |
| 2 | Mandalay | Food and beverages | 69.94 |
| 3 | Mandalay | Health and beauty | 71.00 |
| 4 | Mandalay | Home and lifestyle | 65.16 |
| 5 | Mandalay | Sports and travel | 65.10 |
| 6 | Naypyitaw | Electronic accessories | 67.47 |
| 7 | Naypyitaw | Fashion accessories | 74.40 |
| 8 | Naypyitaw | Food and beverages | 70.80 |
| 9 | Naypyitaw | Health and beauty | 69.98 |
| 10 | Naypyitaw | Home and lifestyle | 70.60 |
| 11 | Naypyitaw | Sports and travel | 70.29 |
| 12 | Yangon | Electronic accessories | 69.12 |
| 13 | Yangon | Fashion accessories | 68.78 |
| 14 | Yangon | Food and beverages | 72.53 |
| 15 | Yangon | Health and beauty | 69.00 |
| 16 | Yangon | Home and lifestyle | 69.31 |
| 17 | Yangon | Sports and travel | 72.58 |
In [425]:
m_graph=px.bar(CSI_DF, y='CSI(%)' , x='City' , title='CSI OF CITY' ,barmode='group', color='Product line',width=500, height=400)
m_graph.show()
SALES PREDICTION MODEL FOR EACH CITY ACCORDING TO PRODUCT LINE¶
In [426]:
sg_data=data.groupby(['City' ,'Product line' ])['Total'].sum().reset_index(name='Total Revenue')
sg_data
Out[426]:
| City | Product line | Total Revenue | |
|---|---|---|---|
| 0 | Mandalay | Electronic accessories | 17051.4435 |
| 1 | Mandalay | Fashion accessories | 16413.3165 |
| 2 | Mandalay | Food and beverages | 15214.8885 |
| 3 | Mandalay | Health and beauty | 19980.6600 |
| 4 | Mandalay | Home and lifestyle | 17549.1645 |
| 5 | Mandalay | Sports and travel | 19988.1990 |
| 6 | Naypyitaw | Electronic accessories | 18968.9745 |
| 7 | Naypyitaw | Fashion accessories | 21560.0700 |
| 8 | Naypyitaw | Food and beverages | 23766.8550 |
| 9 | Naypyitaw | Health and beauty | 16615.3260 |
| 10 | Naypyitaw | Home and lifestyle | 13895.5530 |
| 11 | Naypyitaw | Sports and travel | 15761.9280 |
| 12 | Yangon | Electronic accessories | 18317.1135 |
| 13 | Yangon | Fashion accessories | 16332.5085 |
| 14 | Yangon | Food and beverages | 17163.1005 |
| 15 | Yangon | Health and beauty | 12597.7530 |
| 16 | Yangon | Home and lifestyle | 22417.1955 |
| 17 | Yangon | Sports and travel | 19372.6995 |
In [427]:
sg=px.bar(sg_data , x='Product line' ,y='Total Revenue' , facet_row='City', color='Product line', height=1200,
title='TOTAL REVENUE OF EACH PRODUCT LINE IN DIFFERENT CITIES' )
sg.update_xaxes(showticklabels=True)
sg.show()
In [428]:
Month_sales=data.groupby(['City','Month' ,'Product line' ])['Total'].sum().reset_index()
Month_sales
Out[428]:
| City | Month | Product line | Total | |
|---|---|---|---|---|
| 0 | Mandalay | February | Electronic accessories | 6686.2530 |
| 1 | Mandalay | February | Fashion accessories | 6137.1135 |
| 2 | Mandalay | February | Food and beverages | 5554.8150 |
| 3 | Mandalay | February | Health and beauty | 5856.4275 |
| 4 | Mandalay | February | Home and lifestyle | 4659.8475 |
| 5 | Mandalay | February | Sports and travel | 5529.8145 |
| 6 | Mandalay | January | Electronic accessories | 6699.7770 |
| 7 | Mandalay | January | Fashion accessories | 6112.5960 |
| 8 | Mandalay | January | Food and beverages | 6609.2775 |
| 9 | Mandalay | January | Health and beauty | 6399.8865 |
| 10 | Mandalay | January | Home and lifestyle | 4586.4420 |
| 11 | Mandalay | January | Sports and travel | 6768.0795 |
| 12 | Mandalay | March | Electronic accessories | 3665.4135 |
| 13 | Mandalay | March | Fashion accessories | 4163.6070 |
| 14 | Mandalay | March | Food and beverages | 3050.7960 |
| 15 | Mandalay | March | Health and beauty | 7724.3460 |
| 16 | Mandalay | March | Home and lifestyle | 8302.8750 |
| 17 | Mandalay | March | Sports and travel | 7690.3050 |
| 18 | Naypyitaw | February | Electronic accessories | 5473.8810 |
| 19 | Naypyitaw | February | Fashion accessories | 7699.1145 |
| 20 | Naypyitaw | February | Food and beverages | 7391.3175 |
| 21 | Naypyitaw | February | Health and beauty | 5830.3455 |
| 22 | Naypyitaw | February | Home and lifestyle | 3002.9055 |
| 23 | Naypyitaw | February | Sports and travel | 3537.4185 |
| 24 | Naypyitaw | January | Electronic accessories | 5730.2385 |
| 25 | Naypyitaw | January | Fashion accessories | 6385.0290 |
| 26 | Naypyitaw | January | Food and beverages | 8315.0235 |
| 27 | Naypyitaw | January | Health and beauty | 6020.6895 |
| 28 | Naypyitaw | January | Home and lifestyle | 5594.7045 |
| 29 | Naypyitaw | January | Sports and travel | 8388.9960 |
| 30 | Naypyitaw | March | Electronic accessories | 7764.8550 |
| 31 | Naypyitaw | March | Fashion accessories | 7475.9265 |
| 32 | Naypyitaw | March | Food and beverages | 8060.5140 |
| 33 | Naypyitaw | March | Health and beauty | 4764.2910 |
| 34 | Naypyitaw | March | Home and lifestyle | 5297.9430 |
| 35 | Naypyitaw | March | Sports and travel | 3835.5135 |
| 36 | Yangon | February | Electronic accessories | 5202.7710 |
| 37 | Yangon | February | Fashion accessories | 5173.6335 |
| 38 | Yangon | February | Food and beverages | 7054.2255 |
| 39 | Yangon | February | Health and beauty | 2915.4825 |
| 40 | Yangon | February | Home and lifestyle | 4771.6305 |
| 41 | Yangon | February | Sports and travel | 4742.3775 |
| 42 | Yangon | January | Electronic accessories | 6401.2725 |
| 43 | Yangon | January | Fashion accessories | 6847.4910 |
| 44 | Yangon | January | Food and beverages | 4646.2290 |
| 45 | Yangon | January | Health and beauty | 3962.5950 |
| 46 | Yangon | January | Home and lifestyle | 10313.5935 |
| 47 | Yangon | January | Sports and travel | 6509.9475 |
| 48 | Yangon | March | Electronic accessories | 6713.0700 |
| 49 | Yangon | March | Fashion accessories | 4311.3840 |
| 50 | Yangon | March | Food and beverages | 5462.6460 |
| 51 | Yangon | March | Health and beauty | 5719.6755 |
| 52 | Yangon | March | Home and lifestyle | 7331.9715 |
| 53 | Yangon | March | Sports and travel | 8120.3745 |
In [429]:
Mg=px.bar(Month_sales, x="Month", y="Total", color="Product line" ,facet_row="City", barmode='group', category_orders={"Month":['January' ,'February' , 'March']},
title="MONTHLY SALES OF PRODUCT LINE" ,height=1200)
Mg.update_xaxes(showticklabels=True)
Mg.show()
In [430]:
# Month_sales=data.groupby(['City','Month' ,'Product line' ])['Total'].sum()
# Month_sales
In [431]:
Month_sales.groupby(['City','Month'])['Total'].sum()
Out[431]:
City Month
Mandalay February 34424.2710
January 37176.0585
March 34597.3425
Naypyitaw February 32934.9825
January 40434.6810
March 37199.0430
Yangon February 29860.1205
January 38681.1285
March 37659.1215
Name: Total, dtype: float64
In [432]:
sales_df=Month_sales.groupby(['City','Month'])['Total'].sum().reset_index(name="Revenue")
sales_df
Out[432]:
| City | Month | Revenue | |
|---|---|---|---|
| 0 | Mandalay | February | 34424.2710 |
| 1 | Mandalay | January | 37176.0585 |
| 2 | Mandalay | March | 34597.3425 |
| 3 | Naypyitaw | February | 32934.9825 |
| 4 | Naypyitaw | January | 40434.6810 |
| 5 | Naypyitaw | March | 37199.0430 |
| 6 | Yangon | February | 29860.1205 |
| 7 | Yangon | January | 38681.1285 |
| 8 | Yangon | March | 37659.1215 |
In [433]:
sales_df['Month']=pd.Categorical(sales_df['Month'] ,categories=['January' , 'February' ,'March'] , ordered =True)
In [434]:
final_sales=sales_df.groupby(['City','Month','Revenue'] ,observed=True).size().reset_index()
final_sales
Out[434]:
| City | Month | Revenue | 0 | |
|---|---|---|---|---|
| 0 | Mandalay | January | 37176.0585 | 1 |
| 1 | Mandalay | February | 34424.2710 | 1 |
| 2 | Mandalay | March | 34597.3425 | 1 |
| 3 | Naypyitaw | January | 40434.6810 | 1 |
| 4 | Naypyitaw | February | 32934.9825 | 1 |
| 5 | Naypyitaw | March | 37199.0430 | 1 |
| 6 | Yangon | January | 38681.1285 | 1 |
| 7 | Yangon | February | 29860.1205 | 1 |
| 8 | Yangon | March | 37659.1215 | 1 |
In [435]:
final_sales=sales_df.groupby(['City','Month','Revenue'] ,observed=True).size().reset_index().drop(columns=0)
final_sales
Out[435]:
| City | Month | Revenue | |
|---|---|---|---|
| 0 | Mandalay | January | 37176.0585 |
| 1 | Mandalay | February | 34424.2710 |
| 2 | Mandalay | March | 34597.3425 |
| 3 | Naypyitaw | January | 40434.6810 |
| 4 | Naypyitaw | February | 32934.9825 |
| 5 | Naypyitaw | March | 37199.0430 |
| 6 | Yangon | January | 38681.1285 |
| 7 | Yangon | February | 29860.1205 |
| 8 | Yangon | March | 37659.1215 |
In [436]:
sg2=px.line(final_sales,y='Revenue' , x='Month' ,title='MONTHLY REVENUE OF CITIES' ,
markers=True, height=600, color='City')
sg2.update_xaxes(showticklabels=True)
sg2.show()
REVENUE PREDICTION FOR APRIL MONTH¶
In [437]:
final_sales
Out[437]:
| City | Month | Revenue | |
|---|---|---|---|
| 0 | Mandalay | January | 37176.0585 |
| 1 | Mandalay | February | 34424.2710 |
| 2 | Mandalay | March | 34597.3425 |
| 3 | Naypyitaw | January | 40434.6810 |
| 4 | Naypyitaw | February | 32934.9825 |
| 5 | Naypyitaw | March | 37199.0430 |
| 6 | Yangon | January | 38681.1285 |
| 7 | Yangon | February | 29860.1205 |
| 8 | Yangon | March | 37659.1215 |
In [438]:
pvt_table=final_sales.pivot_table(index='City' , columns='Month' , values='Revenue' , observed=True)
pvt_table
Out[438]:
| Month | January | February | March |
|---|---|---|---|
| City | |||
| Mandalay | 37176.0585 | 34424.2710 | 34597.3425 |
| Naypyitaw | 40434.6810 | 32934.9825 | 37199.0430 |
| Yangon | 38681.1285 | 29860.1205 | 37659.1215 |
In [447]:
# Step 1: Define training data (to predict March from Jan & Feb)
X_train = pvt_table[['January', 'February']]
y_train = pvt_table['March']
# Step 2: Train the model
model = LinearRegression()
model.fit(X_train, y_train)
# Step 3: Prepare data to predict April (using Feb & Mar)
X_april = pvt_table[['February', 'March']].copy()
# Step 4: Rename columns to match the training feature names
X_april.columns = ['January', 'February'] # Rename to match X_train
# Step 5: Predict April values
pvt_table['April_Prediction'] = model.predict(X_april)
# Step 6: View result
pvt_table[['January', 'February', 'March', 'April_Prediction']]
Out[447]:
| Month | January | February | March | April_Prediction |
|---|---|---|---|---|
| 0 | 37176.0585 | 34424.2710 | 34597.3425 | 32920.760680 |
| 1 | 40434.6810 | 32934.9825 | 37199.0430 | 30809.846084 |
| 2 | 38681.1285 | 29860.1205 | 37659.1215 | 28808.448681 |
In [448]:
pvt_table = pvt_table.reset_index()
pvt_table
Out[448]:
| Month | index | City | January | February | March | April_Prediction |
|---|---|---|---|---|---|---|
| 0 | 0 | Mandalay | 37176.0585 | 34424.2710 | 34597.3425 | 32920.760680 |
| 1 | 1 | Naypyitaw | 40434.6810 | 32934.9825 | 37199.0430 | 30809.846084 |
| 2 | 2 | Yangon | 38681.1285 | 29860.1205 | 37659.1215 | 28808.448681 |
In [449]:
pvt_table[['City', 'January', 'February', 'March']]
Out[449]:
| Month | City | January | February | March |
|---|---|---|---|---|
| 0 | Mandalay | 37176.0585 | 34424.2710 | 34597.3425 |
| 1 | Naypyitaw | 40434.6810 | 32934.9825 | 37199.0430 |
| 2 | Yangon | 38681.1285 | 29860.1205 | 37659.1215 |
In [450]:
sg_long = pd.melt(
pvt_table,
id_vars=['City'],
value_vars=['January', 'February', 'March', 'April_Prediction'],
var_name='Month',
value_name='Revenue'
)
sg_long
Out[450]:
| City | Month | Revenue | |
|---|---|---|---|
| 0 | Mandalay | January | 37176.058500 |
| 1 | Naypyitaw | January | 40434.681000 |
| 2 | Yangon | January | 38681.128500 |
| 3 | Mandalay | February | 34424.271000 |
| 4 | Naypyitaw | February | 32934.982500 |
| 5 | Yangon | February | 29860.120500 |
| 6 | Mandalay | March | 34597.342500 |
| 7 | Naypyitaw | March | 37199.043000 |
| 8 | Yangon | March | 37659.121500 |
| 9 | Mandalay | April_Prediction | 32920.760680 |
| 10 | Naypyitaw | April_Prediction | 30809.846084 |
| 11 | Yangon | April_Prediction | 28808.448681 |
In [451]:
sg_predict_graph=px.line(sg_long , x='Month' ,y='Revenue' ,color='City' , markers=True)
sg_predict_graph.show()
NOW I'LL BE PUTTING ALL OF HE GRAPHS IN THE FORM OF DASBOARD¶
BY USING STREAMLIT AND GITHUB¶
In [ ]: